intact_annot <- readRDS("/data1/gdouglas/projects/accessory_vs_pseudogene/summary/intact_and_both_cluster_annot.rds")
intact_annot <- intact_annot[-which(intact_annot$COG_category == "-"), ]
intact_annot <- intact_annot[-which(intact_annot$COG_category == ""), ]
# For intact clusters, filter out clusters outside size range (at least based on what clusters they're in).
cluster_filt_lengths <- read.table(file = "/data1/gdouglas/projects/accessory_vs_pseudogene/summary/cluster_filt_lengths_and_additional.tsv.gz",
header = TRUE, sep = "\t", row.names = 1)
intact_annot <- intact_annot[which(rownames(intact_annot) %in% rownames(cluster_filt_lengths)), ]
pseudo_annot <- readRDS("/data1/gdouglas/projects/accessory_vs_pseudogene/summary/intergenic_pseudo_cluster_COG_annot.rds")
pseudo_annot <- pseudo_annot$pseudo
pseudo_annot <- pseudo_annot[-which(pseudo_annot$majority_COG_category == ""), ]
rownames(pseudo_annot) <- pseudo_annot$cluster
pseudo_annot <- pseudo_annot[, c(2, 3)]
colnames(pseudo_annot) <- c("COG", "COG_category")
intact_annot <- intact_annot[, c("all_COG", "COG_category")]
colnames(intact_annot) <- c("COG", "COG_category")
COG_category_annot <- rbind(intact_annot, pseudo_annot)
cluster_pangenome_categories <- readRDS("/data1/gdouglas/projects/accessory_vs_pseudogene/summary/cluster_pangenome_categories.rds")
# Read in COG category db info.
COG_category_descrip <- read.table("/data1/gdouglas/db/COG_definitions/COG_category_descrip.tsv",
header = FALSE, sep="\t", stringsAsFactors = FALSE, row.names = 1)
# Ignore several COG categories (eukaryote-associated primarily)
COG_category_descrip <- COG_category_descrip[which(! rownames(COG_category_descrip) %in% c("A", "B", "Y", "Z")), , drop = FALSE]
prep_heatmap_input <- function(clusters_to_include) {
# Subset to specific clusters only.
cluster_pangenome_categories_subset <- cluster_pangenome_categories
for (sp in names(cluster_pangenome_categories_subset)) {
for (type in names(cluster_pangenome_categories_subset[[sp]])) {
for (partition in names(cluster_pangenome_categories_subset[[sp]][[type]])) {
cluster_pangenome_categories_subset[[sp]][[type]][[partition]] <- intersect(cluster_pangenome_categories_subset[[sp]][[type]][[partition]],
clusters_to_include)
}
}
}
all_species <- names(cluster_pangenome_categories_subset)
pangenome_category_breakdown <- data.frame(matrix(NA, nrow = length(all_species), ncol = 12))
rownames(pangenome_category_breakdown) <- all_species
colnames(pangenome_category_breakdown) <- c("intact_ultra.cloud", "intact_other.cloud", "intact_shell", "intact_soft.core",
"both_ultra.cloud", "both_other.cloud", "both_shell", "both_soft.core",
"pseudo_ultra.cloud", "pseudo_other.cloud", "pseudo_shell", "pseudo_soft.core")
for (sp in all_species) {
for (type in c("intact", "both", "pseudo")) {
for (partition in c("ultra.cloud", "other.cloud", "shell", "soft.core")) {
column_name <- paste(type, partition, sep = "_")
pangenome_category_breakdown[sp, column_name] <- length(cluster_pangenome_categories_subset[[sp]][[type]][[partition]])
}
}
}
pangenome_category_breakdown_rel.by.category <- pangenome_category_breakdown
pangenome_category_breakdown_rel.by.pangenome <- pangenome_category_breakdown
pseudo_colnames <- paste("pseudo", c("ultra.cloud", "other.cloud", "shell", "soft.core"), sep = "_")
intact_colnames <- paste("intact", c("ultra.cloud", "other.cloud", "shell", "soft.core"), sep = "_")
mixed_colnames <- paste("both", c("ultra.cloud", "other.cloud", "shell", "soft.core"), sep = "_")
ultra.cloud_colnames <- paste(c("intact", "both", "pseudo"), "ultra.cloud", sep = "_")
other.cloud_colnames <- paste(c("intact", "both", "pseudo"), "other.cloud", sep = "_")
shell_colnames <- paste(c("intact", "both", "pseudo"), "shell", sep = "_")
soft.core_colnames <- paste(c("intact", "both", "pseudo"), "soft.core", sep = "_")
for (sp in all_species) {
sp_total_pseudo <- sum(pangenome_category_breakdown[sp, pseudo_colnames])
sp_total_intact <- sum(pangenome_category_breakdown[sp, intact_colnames])
sp_total_mixed <- sum(pangenome_category_breakdown[sp, mixed_colnames])
sp_total_ultra.cloud <- sum(pangenome_category_breakdown[sp, ultra.cloud_colnames])
sp_total_other.cloud <- sum(pangenome_category_breakdown[sp, other.cloud_colnames])
sp_total_shell <- sum(pangenome_category_breakdown[sp, shell_colnames])
sp_total_soft.core <- sum(pangenome_category_breakdown[sp, soft.core_colnames])
pangenome_category_breakdown_rel.by.category[sp, pseudo_colnames] <- (pangenome_category_breakdown_rel.by.category[sp, pseudo_colnames] / sp_total_pseudo) * 100
pangenome_category_breakdown_rel.by.category[sp, intact_colnames] <- (pangenome_category_breakdown_rel.by.category[sp, intact_colnames] / sp_total_intact) * 100
pangenome_category_breakdown_rel.by.category[sp, mixed_colnames] <- (pangenome_category_breakdown_rel.by.category[sp, mixed_colnames] / sp_total_mixed) * 100
pangenome_category_breakdown_rel.by.pangenome[sp, ultra.cloud_colnames] <- (pangenome_category_breakdown_rel.by.pangenome[sp, ultra.cloud_colnames] / sp_total_ultra.cloud) * 100
pangenome_category_breakdown_rel.by.pangenome[sp, other.cloud_colnames] <- (pangenome_category_breakdown_rel.by.pangenome[sp, other.cloud_colnames] / sp_total_other.cloud) * 100
pangenome_category_breakdown_rel.by.pangenome[sp, shell_colnames] <- (pangenome_category_breakdown_rel.by.pangenome[sp, shell_colnames] / sp_total_shell) * 100
pangenome_category_breakdown_rel.by.pangenome[sp, soft.core_colnames] <- (pangenome_category_breakdown_rel.by.pangenome[sp, soft.core_colnames] / sp_total_soft.core) * 100
}
pangenome_category_breakdown_rel.by.category <- round(pangenome_category_breakdown_rel.by.category, 1)
pangenome_category_breakdown_rel.by.pangenome <- round(pangenome_category_breakdown_rel.by.pangenome, 1)
pangenome_category_breakdown_rel.by.pangenome <- pangenome_category_breakdown_rel.by.pangenome[, c(ultra.cloud_colnames, other.cloud_colnames, shell_colnames, soft.core_colnames)]
pangenome_category_breakdown_by.category <- pangenome_category_breakdown[, colnames(pangenome_category_breakdown_rel.by.category)]
pangenome_category_breakdown_by.pangenome <- pangenome_category_breakdown[, colnames(pangenome_category_breakdown_rel.by.pangenome)]
# Get text to show in heatmaps
pangenome_category_breakdown_rel.by.category_char <- apply(pangenome_category_breakdown_rel.by.category, 2, function(x) { paste(x, "%\n", sep = "") })
pangenome_category_breakdown_by.category_char <- apply(pangenome_category_breakdown_by.category, 2, function(x) { paste("(", x, ")", sep = "") })
for (i in 1:ncol(pangenome_category_breakdown_rel.by.category_char)) {
pangenome_category_breakdown_rel.by.category_char[, i] <- paste(pangenome_category_breakdown_rel.by.category_char[, i],
pangenome_category_breakdown_by.category_char[, i], sep = "")
}
pangenome_category_breakdown_rel.by.pangenome_char <- apply(pangenome_category_breakdown_rel.by.pangenome, 2, function(x) { paste(x, "%\n", sep = "") })
pangenome_category_breakdown_by.pangenome_char <- apply(pangenome_category_breakdown_by.pangenome, 2, function(x) { paste("(", x, ")", sep = "") })
for (i in 1:ncol(pangenome_category_breakdown_rel.by.pangenome_char)) {
pangenome_category_breakdown_rel.by.pangenome_char[, i] <- paste(pangenome_category_breakdown_rel.by.pangenome_char[, i],
pangenome_category_breakdown_by.pangenome_char[, i], sep = "")
}
return(list(by_category = pangenome_category_breakdown_rel.by.category,
by_category_fill = pangenome_category_breakdown_rel.by.category_char,
by_pangenome = pangenome_category_breakdown_rel.by.pangenome,
by_pangenome_fill = pangenome_category_breakdown_rel.by.pangenome_char))
}
all_clusters <- as.character()
for (sp in names(cluster_pangenome_categories)) {
for (type in names(cluster_pangenome_categories[[sp]])) {
for (partition in names(cluster_pangenome_categories[[sp]][[type]])) {
all_clusters <- c(all_clusters,
cluster_pangenome_categories[[sp]][[type]][[partition]])
}
}
}
all_clusters <- unique(all_clusters)
overall_prepped_files <- prep_heatmap_input(all_clusters)
# Save tables as files for future reference.
write.table(x = overall_prepped_files$by_category,
file = "/data1/gdouglas/projects/accessory_vs_pseudogene/summary/cluster_breakdown_tables/all.clusters_percent_by_type.tsv",
col.names = NA, row.names = TRUE, sep = "\t", quote = FALSE)
by_category_fill_to_write <- gsub("\n", " ", overall_prepped_files$by_category_fill)
rownames(by_category_fill_to_write) <- rownames(overall_prepped_files$by_category)
write.table(x = by_category_fill_to_write,
file = "/data1/gdouglas/projects/accessory_vs_pseudogene/summary/cluster_breakdown_tables/all.clusters_percent_by_type_fill.tsv",
col.names = NA, row.names = TRUE, sep = "\t", quote = FALSE)
write.table(x = overall_prepped_files$by_pangenome,
file = "/data1/gdouglas/projects/accessory_vs_pseudogene/summary/cluster_breakdown_tables/all.clusters_percent_by_partition.tsv",
col.names = NA, row.names = TRUE, sep = "\t", quote = FALSE)
by_pangenome_fill_to_write <- gsub("\n", " ", overall_prepped_files$by_pangenome_fill)
rownames(by_pangenome_fill_to_write) <- rownames(overall_prepped_files$by_pangenome)
write.table(x = by_pangenome_fill_to_write,
file = "/data1/gdouglas/projects/accessory_vs_pseudogene/summary/cluster_breakdown_tables/all.clusters_percent_by_partition_fill.tsv",
col.names = NA, row.names = TRUE, sep = "\t", quote = FALSE)
draw(column_title = "Cluster breakdown by cluster type, for all clusters (including those lacking COG category annotation)",
Heatmap(matrix = as.matrix(overall_prepped_files$by_category),
name = "Percent",
row_names_side = "left",
row_labels = gt_render(paste("*", gsub("_", " ", rownames(overall_prepped_files$by_category)), "*", sep = "")),
column_labels = c("Ultra-cloud", "Other-cloud", "Shell", "Soft-core", "Ultra-cloud", "Other-cloud", "Shell", "Soft-core", "Ultra-cloud", "Other-cloud", "Shell", "Soft-core"),
column_split = c("Intact", "Intact", "Intact", "Intact", "Mixed", "Mixed", "Mixed", "Mixed", "Pseudogene", "Pseudogene", "Pseudogene", "Pseudogene"),
cluster_rows = FALSE,
column_gap = unit(5, "mm"),
cluster_columns = FALSE,
column_names_rot = 45,
col = colorRamp2(c(0, 50, 100), c("slateblue1","white", "red")),
cell_fun = function(j, i, x, y, width, height, fill) {
grid.text(overall_prepped_files$by_category_fill[i, j], x, y, gp = gpar(fontsize = 10, fontface="bold"))
}))
draw(column_title = "Cluster breakdown by pangenome partition, for all clusters (including unannotated)",
Heatmap(matrix = as.matrix(overall_prepped_files$by_pangenome),
name = "Percent",
column_labels = c("Intact", "Mixed", "Pseudogene", "Intact", "Mixed", "Pseudogene", "Intact", "Mixed", "Pseudogene", "Intact", "Mixed", "Pseudogene"),
row_names_side = "left",
row_labels = gt_render(paste("*", gsub("_", " ", rownames(overall_prepped_files$by_pangenome)), "*", sep = "")),
column_split = c("a) Ultra-cloud", "a) Ultra-cloud", "a) Ultra-cloud", "b) Other-cloud", "b) Other-cloud", "b) Other-cloud", "c) Shell", "c) Shell", "c) Shell", "d) Soft-core", "d) Soft-core", "d) Soft-core"),
cluster_rows = FALSE,
column_gap = unit(5, "mm"),
cluster_columns = FALSE,
column_names_rot = 45,
col = colorRamp2(c(0, 50, 100), c("slateblue1","white", "red")),
cell_fun = function(j, i, x, y, width, height, fill) {
grid.text(overall_prepped_files$by_pangenome_fill[i, j], x, y, gp = gpar(fontsize = 10, fontface="bold"))
})
)
unannot_clusters <- all_clusters[which(! all_clusters %in% rownames(COG_category_annot))]
all_species <- names(cluster_pangenome_categories)
pangenome_category_breakdown_unannot <- data.frame(matrix(NA, nrow = length(all_species), ncol = 12))
rownames(pangenome_category_breakdown_unannot) <- all_species
colnames(pangenome_category_breakdown_unannot) <- c("intact_ultra.cloud", "intact_other.cloud", "intact_shell", "intact_soft.core",
"both_ultra.cloud", "both_other.cloud", "both_shell", "both_soft.core",
"pseudo_ultra.cloud", "pseudo_other.cloud", "pseudo_shell", "pseudo_soft.core")
pangenome_category_breakdown_unannot_percent <- pangenome_category_breakdown_unannot
for (sp in all_species) {
for (type in c("intact", "both", "pseudo")) {
for (partition in c("ultra.cloud", "other.cloud", "shell", "soft.core")) {
column_name <- paste(type, partition, sep = "_")
pangenome_category_breakdown_unannot[sp, column_name] <- length(which(cluster_pangenome_categories[[sp]][[type]][[partition]] %in% unannot_clusters))
pangenome_category_breakdown_unannot_percent[sp, column_name] <- (pangenome_category_breakdown_unannot[sp, column_name] /
length(cluster_pangenome_categories[[sp]][[type]][[partition]])) * 100
}
}
}
pangenome_category_breakdown_unannot_percent_char <- apply(pangenome_category_breakdown_unannot_percent, 2, function(x) { sprintf('%.1f', x) })
pangenome_category_breakdown_unannot_char <- apply(pangenome_category_breakdown_unannot, 2, as.character)
pangenome_category_breakdown_unannot_heatmap_fill <- pangenome_category_breakdown_unannot_percent_char
for (i in 1:nrow(pangenome_category_breakdown_unannot_heatmap_fill)) {
for(j in 1:ncol(pangenome_category_breakdown_unannot_heatmap_fill)) {
pangenome_category_breakdown_unannot_heatmap_fill[i, j] <- paste(pangenome_category_breakdown_unannot_percent_char[i, j],
"%\n(",
pangenome_category_breakdown_unannot_char[i, j],
")",
sep = "")
}
}
draw(column_title = "Percent of clusters without COG annotation (out of total for that cluster type, species, and pangenome partition)",
Heatmap(matrix = as.matrix(pangenome_category_breakdown_unannot_percent),
name = "Percent",
row_names_side = "left",
row_labels = gt_render(paste("*", gsub("_", " ", rownames(pangenome_category_breakdown_unannot_percent)), "*", sep = "")),
column_labels = c("Ultra-cloud", "Other-cloud", "Shell", "Soft-core", "Ultra-cloud", "Other-cloud", "Shell", "Soft-core", "Ultra-cloud", "Other-cloud", "Shell", "Soft-core"),
column_split = c("Intact", "Intact", "Intact", "Intact", "Mixed", "Mixed", "Mixed", "Mixed", "Pseudogene", "Pseudogene", "Pseudogene", "Pseudogene"),
cluster_rows = FALSE,
cluster_columns = FALSE,
column_names_rot = 45,
column_gap = unit(5, "mm"),
col = colorRamp2(c(0, 50, 100), c("slateblue1","white", "red")),
cell_fun = function(j, i, x, y, width, height, fill) {
grid.text(pangenome_category_breakdown_unannot_heatmap_fill[i, j], x, y, gp = gpar(fontsize = 10, fontface="bold"))
})
)
# Save these tables too:
write.table(x = pangenome_category_breakdown_unannot_percent,
file = "/data1/gdouglas/projects/accessory_vs_pseudogene/summary/cluster_breakdown_tables/unannot_percent_by_table.cell.tsv",
col.names = NA, row.names = TRUE, sep = "\t", quote = FALSE)
pangenome_category_breakdown_unannot_heatmap_fill_to_write <- gsub("\n", " ", pangenome_category_breakdown_unannot_heatmap_fill)
rownames(pangenome_category_breakdown_unannot_heatmap_fill_to_write) <- rownames(pangenome_category_breakdown_unannot_percent)
write.table(x = pangenome_category_breakdown_unannot_heatmap_fill_to_write,
file = "/data1/gdouglas/projects/accessory_vs_pseudogene/summary/cluster_breakdown_tables/unannot_percent_by_table.cell_fill.tsv",
col.names = NA, row.names = TRUE, sep = "\t", quote = FALSE)
annot_prepped_files <- prep_heatmap_input(rownames(COG_category_annot))
# Save tables as files for future reference.
write.table(x = annot_prepped_files$by_category,
file = "/data1/gdouglas/projects/accessory_vs_pseudogene/summary/cluster_breakdown_tables/cog.category.annot_percent_by_type.tsv",
col.names = NA, row.names = TRUE, sep = "\t", quote = FALSE)
annot.only_by_category_fill_to_write <- gsub("\n", " ", annot_prepped_files$by_category_fill)
rownames(annot.only_by_category_fill_to_write) <- rownames(annot_prepped_files$by_category)
write.table(x = annot.only_by_category_fill_to_write,
file = "/data1/gdouglas/projects/accessory_vs_pseudogene/summary/cluster_breakdown_tables/cog.category.annot_percent_by_type_fill.tsv",
col.names = NA, row.names = TRUE, sep = "\t", quote = FALSE)
write.table(x = annot_prepped_files$by_pangenome,
file = "/data1/gdouglas/projects/accessory_vs_pseudogene/summary/cluster_breakdown_tables/cog.category.annot_percent_by_partition.tsv",
col.names = NA, row.names = TRUE, sep = "\t", quote = FALSE)
annot.only_by_pangenome_fill_to_write <- gsub("\n", " ", annot_prepped_files$by_pangenome_fill)
rownames(annot.only_by_pangenome_fill_to_write) <- rownames(annot_prepped_files$by_pangenome)
write.table(x = annot.only_by_pangenome_fill_to_write,
file = "/data1/gdouglas/projects/accessory_vs_pseudogene/summary/cluster_breakdown_tables/cog.category.annot_percent_by_partition_fill.tsv",
col.names = NA, row.names = TRUE, sep = "\t", quote = FALSE)
draw(column_title = "Cluster breakdown by cluster type, for COG-category annotated clusters",
Heatmap(matrix = as.matrix(annot_prepped_files$by_category),
name = "Percent",
row_names_side = "left",
row_labels = gt_render(paste("*", gsub("_", " ", rownames(annot_prepped_files$by_category)), "*", sep = "")),
column_labels = c("Ultra-cloud", "Other-cloud", "Shell", "Soft-core", "Ultra-cloud", "Other-cloud", "Shell", "Soft-core", "Ultra-cloud", "Other-cloud", "Shell", "Soft-core"),
column_split = c("Intact", "Intact", "Intact", "Intact", "Mixed", "Mixed", "Mixed", "Mixed", "Pseudogene", "Pseudogene", "Pseudogene", "Pseudogene"),
cluster_rows = FALSE,
column_gap = unit(5, "mm"),
cluster_columns = FALSE,
column_names_rot = 45,
col = colorRamp2(c(0, 50, 100), c("slateblue1","white", "red")),
cell_fun = function(j, i, x, y, width, height, fill) {
grid.text(annot_prepped_files$by_category_fill[i, j], x, y, gp = gpar(fontsize = 10, fontface="bold"))
}))
draw(column_title = "Cluster breakdown by pangenome partition, for COG-category annotated clusters",
Heatmap(matrix = as.matrix(annot_prepped_files$by_pangenome),
name = "Percent",
column_labels = c("Intact", "Mixed", "Pseudogene", "Intact", "Mixed", "Pseudogene", "Intact", "Mixed", "Pseudogene", "Intact", "Mixed", "Pseudogene"),
row_names_side = "left",
row_labels = gt_render(paste("*", gsub("_", " ", rownames(annot_prepped_files$by_pangenome)), "*", sep = "")),
column_split = c("a) Ultra-cloud", "a) Ultra-cloud", "a) Ultra-cloud", "b) Other-cloud", "b) Other-cloud", "b) Other-cloud", "c) Shell", "c) Shell", "c) Shell", "d) Soft-core", "d) Soft-core", "d) Soft-core"),
cluster_rows = FALSE,
column_gap = unit(5, "mm"),
cluster_columns = FALSE,
column_names_rot = 45,
col = colorRamp2(c(0, 50, 100), c("slateblue1","white", "red")),
cell_fun = function(j, i, x, y, width, height, fill) {
grid.text(annot_prepped_files$by_pangenome_fill[i, j], x, y, gp = gpar(fontsize = 10, fontface="bold"))
})
)
Session details printed out for reproducibility.
sessionInfo()
## R version 4.2.2 Patched (2022-11-10 r83330)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] gridtext_0.1.5 circlize_0.4.15 ComplexHeatmap_2.14.0
## [4] knitr_1.41 kableExtra_1.3.4
##
## loaded via a namespace (and not attached):
## [1] shape_1.4.6 GetoptLong_1.0.5 xfun_0.36
## [4] bslib_0.4.2 colorspace_2.0-3 vctrs_0.5.1
## [7] htmltools_0.5.4 stats4_4.2.2 viridisLite_0.4.1
## [10] yaml_2.3.6 rlang_1.0.6 jquerylib_0.1.4
## [13] glue_1.6.2 BiocGenerics_0.44.0 RColorBrewer_1.1-3
## [16] matrixStats_0.63.0 foreach_1.5.2 lifecycle_1.0.3
## [19] stringr_1.5.0 commonmark_1.8.1 munsell_0.5.0
## [22] rvest_1.0.3 GlobalOptions_0.1.2 codetools_0.2-18
## [25] evaluate_0.19 IRanges_2.32.0 fastmap_1.1.0
## [28] doParallel_1.0.17 parallel_4.2.2 markdown_1.4
## [31] highr_0.10 Rcpp_1.0.7 scales_1.2.1
## [34] cachem_1.0.6 S4Vectors_0.36.1 webshot_0.5.4
## [37] jsonlite_1.8.4 systemfonts_1.0.4 rjson_0.2.21
## [40] png_0.1-8 digest_0.6.31 stringi_1.7.8
## [43] clue_0.3-63 cli_3.5.0 tools_4.2.2
## [46] magrittr_2.0.3 sass_0.4.4 cluster_2.1.4
## [49] crayon_1.5.2 xml2_1.3.3 rmarkdown_2.19
## [52] svglite_2.1.0 httr_1.4.4 rstudioapi_0.14
## [55] iterators_1.0.14 R6_2.5.1 compiler_4.2.2